STI Analysis

Author

Cody Appa

Published

May 4, 2023

Preamble

This project aims to allow the user to interactively look at infection rates of the most prevalent STIs: chlamydia, gonorrhea, and syphilis. By using this portfolio you will be able to mouse over a map of the United States for each infection and visualize data from the CDC on infection rates per county.

Data

Code
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(scales))
library(tidyverse)
library(dplyr)
library(ggplot2)
library(readxl)
library(scales)
library(rnaturalearth)
library(rnaturalearthdata)

Attaching package: 'rnaturalearthdata'
The following object is masked from 'package:rnaturalearth':

    countries110
Code
library(sf)
Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
Code
library(tigris)
To enable caching of data, set `options(tigris_use_cache = TRUE)`
in your R script or .Rprofile.
Code
suppressPackageStartupMessages(library(tigris))

#STIDictionary<-read_excel("STISheet.xlsx")
#knitr::kable(STIDictionary)
Code
# Load the 2021 county-level chlamydia rates from the CSV that sits next to
# this document. The first row of the file supplies the column names.
data <- read.csv(
  file = "Chlamydia - Rates of Reported Cases by County United States 2021 .csv",
  header = TRUE
)

Visualizations

Each of these visualizations is an interactive spatial heat map of the United States. Mousing over an individual county shows the county name and its infection rate.

Code
library(sf)
library(ggplot2)
library(dplyr)
library(ggiraph)

# Read the chlamydia rates; the first row of the CSV holds the column names.
data <- read.csv(
  file = "Chlamydia - Rates of Reported Cases by County United States 2021 .csv",
  header = TRUE
)

# Force Rate to numeric so it can drive a continuous colour scale. Entries that
# cannot be parsed as numbers become NA (R warns about the coercion).
data$Rate <- as.numeric(data$Rate)
Warning: NAs introduced by coercion
Code
# Download the 2020 cartographic-boundary county polygons (1:20m) as an sf
# object. Warnings raised by the download are silenced and the progress bar is
# switched off so neither clutters the rendered document.
us_counties <- suppressWarnings(
  tigris::counties(
    cb = TRUE,
    resolution = "20m",
    year = 2020,
    class = "sf",
    progress_bar = FALSE
  )
)

# Attach the chlamydia rates to the county polygons by county name.
# NOTE(review): the key is the county name alone and dplyr reports a
# many-to-many match for this join, so same-named counties in different states
# pick up each other's rows. A state-qualified key (for example a FIPS code
# matched against GEOID) would fix this, but that depends on which columns the
# CSV provides -- TODO confirm and tighten the join.
us_counties_data <- left_join(us_counties, data, by = c("NAMELSAD" = "County"))
Warning in sf_column %in% names(g): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1 of `x` matches multiple rows in `y`.
ℹ Row 65 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
Code
# Make sure the fill variable is numeric before it is mapped to a continuous
# colour scale.
us_counties_data$Rate <- as.numeric(us_counties_data$Rate)

# Keep only the contiguous United States: drop Alaska (02), Hawaii (15),
# American Samoa (60), Guam (66), the Northern Mariana Islands (69),
# Puerto Rico (72) and the U.S. Virgin Islands (78) by state FIPS code.
us_counties_data_contiguous <- us_counties_data %>%
  filter(!(STATEFP %in% c("02", "15", "60", "66", "69", "72", "78")))

# Low-to-high colour ramp for the infection rate.
my_colors <- c("blue", "purple", "red", "orange", "yellow")

# County choropleth; each polygon carries a tooltip with its name and rate.
# Counties without a rate are drawn in grey70.
gg <- ggplot() +
  geom_sf_interactive(
    data = us_counties_data_contiguous,
    aes(
      fill = Rate,
      tooltip = paste(NAMELSAD, "<br>", "Rate:", Rate)
    ),
    color = "grey",
    size = 0.1
  ) +
  scale_fill_gradientn(colors = my_colors, na.value = "grey70", name = "Rate") +
  labs(
    title = "Chlamydia Infection Rate by County 2021",
    caption = "Total infection rate by chlamydia by county"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    axis.title = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank(),
    plot.caption = element_text(
      hjust = .5,
      size = 8,
      margin = margin(t = 10, r = 10)
    )
  )

# Render the interactive widget. girafe() replaces the deprecated ggiraph()
# wrapper and takes the ggplot object directly.
girafe(ggobj = gg)
Function `ggiraph()` is replaced by `girafe()` and will be removed soon.
Code
library(sf)
library(ggplot2)
library(dplyr)
library(ggiraph)

# Read the gonorrhea rates; the first row of the CSV holds the column names.
data2 <- read.csv(
  file = "Gonorrhea - Rates of Reported Cases by County United States 2021 .csv",
  header = TRUE
)

# Force Rate to numeric so it can drive a continuous colour scale. Entries that
# cannot be parsed as numbers become NA (R warns about the coercion).
data2$Rate <- as.numeric(data2$Rate)
Warning: NAs introduced by coercion
Code
# Download the 2020 cartographic-boundary county polygons (1:20m) as an sf
# object. Warnings raised by the download are silenced and the progress bar is
# switched off so neither clutters the rendered document.
us_counties <- suppressWarnings(
  tigris::counties(
    cb = TRUE,
    resolution = "20m",
    year = 2020,
    class = "sf",
    progress_bar = FALSE
  )
)

# Attach the gonorrhea rates to the county polygons by county name.
# NOTE(review): the key is the county name alone and dplyr reports a
# many-to-many match for this join, so same-named counties in different states
# pick up each other's rows. A state-qualified key (for example a FIPS code
# matched against GEOID) would fix this, but that depends on which columns the
# CSV provides -- TODO confirm and tighten the join.
us_counties_data <- left_join(us_counties, data2, by = c("NAMELSAD" = "County"))
Warning in sf_column %in% names(g): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1 of `x` matches multiple rows in `y`.
ℹ Row 65 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
Code
# Make sure the fill variable is numeric before it is mapped to a continuous
# colour scale.
us_counties_data$Rate <- as.numeric(us_counties_data$Rate)

# Keep only the contiguous United States: drop Alaska (02), Hawaii (15),
# American Samoa (60), Guam (66), the Northern Mariana Islands (69),
# Puerto Rico (72) and the U.S. Virgin Islands (78) by state FIPS code.
us_counties_data_contiguous <- us_counties_data %>%
  filter(!(STATEFP %in% c("02", "15", "60", "66", "69", "72", "78")))

# Low-to-high colour ramp for the infection rate.
my_colors <- c("blue", "purple", "red", "orange", "yellow")

# County choropleth; each polygon carries a tooltip with its name and rate.
# Counties without a rate are drawn in grey70.
gg <- ggplot() +
  geom_sf_interactive(
    data = us_counties_data_contiguous,
    aes(
      fill = Rate,
      tooltip = paste(NAMELSAD, "<br>", "Rate:", Rate)
    ),
    color = "grey",
    size = 0.1
  ) +
  scale_fill_gradientn(colors = my_colors, na.value = "grey70", name = "Rate") +
  labs(
    title = "Gonorrhea Infection Rate by County 2021",
    caption = "Total infection rate by gonorrhea by county"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    axis.title = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank(),
    plot.caption = element_text(
      hjust = .5,
      size = 8,
      margin = margin(t = 10, r = 10)
    )
  )

# Render the interactive widget. girafe() replaces the deprecated ggiraph()
# wrapper and takes the ggplot object directly.
girafe(ggobj = gg)
Function `ggiraph()` is replaced by `girafe()` and will be removed soon.
Code
library(sf)
library(ggplot2)
library(dplyr)
library(ggiraph)

# Read the primary and secondary syphilis rates; the first row of the CSV holds
# the column names.
data3 <- read.csv(
  file = "Primary and Secondary Syphilis - Rates of Reported Cases by County United States 2021 .csv",
  header = TRUE
)

# Force Rate to numeric so it can drive a continuous colour scale. Entries that
# cannot be parsed as numbers become NA (R warns about the coercion).
data3$Rate <- as.numeric(data3$Rate)
Warning: NAs introduced by coercion
Code
# Download the 2020 cartographic-boundary county polygons (1:20m) as an sf
# object. Warnings raised by the download are silenced and the progress bar is
# switched off so neither clutters the rendered document.
us_counties <- suppressWarnings(
  tigris::counties(
    cb = TRUE,
    resolution = "20m",
    year = 2020,
    class = "sf",
    progress_bar = FALSE
  )
)

# Attach the syphilis rates to the county polygons by county name.
# NOTE(review): the key is the county name alone and dplyr reports a
# many-to-many match for this join, so same-named counties in different states
# pick up each other's rows. A state-qualified key (for example a FIPS code
# matched against GEOID) would fix this, but that depends on which columns the
# CSV provides -- TODO confirm and tighten the join.
us_counties_data <- left_join(us_counties, data3, by = c("NAMELSAD" = "County"))
Warning in sf_column %in% names(g): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1 of `x` matches multiple rows in `y`.
ℹ Row 65 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
Code
# Make sure the fill variable is numeric before it is mapped to a continuous
# colour scale.
us_counties_data$Rate <- as.numeric(us_counties_data$Rate)

# Keep only the contiguous United States: drop Alaska (02), Hawaii (15),
# American Samoa (60), Guam (66), the Northern Mariana Islands (69),
# Puerto Rico (72) and the U.S. Virgin Islands (78) by state FIPS code.
us_counties_data_contiguous <- us_counties_data %>%
  filter(!(STATEFP %in% c("02", "15", "60", "66", "69", "72", "78")))

# Low-to-high colour ramp for the infection rate.
my_colors <- c("blue", "purple", "red", "orange", "yellow")

# County choropleth; each polygon carries a tooltip with its name and rate.
# Counties without a rate are drawn in grey70.
gg <- ggplot() +
  geom_sf_interactive(
    data = us_counties_data_contiguous,
    aes(
      fill = Rate,
      tooltip = paste(NAMELSAD, "<br>", "Rate:", Rate)
    ),
    color = "grey",
    size = 0.1
  ) +
  scale_fill_gradientn(colors = my_colors, na.value = "grey70", name = "Rate") +
  labs(
    title = "Primary and Secondary Syphilis Infection Rate by County 2021",
    caption = "Total infection rate by primary and secondary Syphilis by county"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    axis.title = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank(),
    plot.caption = element_text(
      hjust = .5,
      size = 8,
      margin = margin(t = 10, r = 10)
    )
  )

# Render the interactive widget. girafe() replaces the deprecated ggiraph()
# wrapper and takes the ggplot object directly.
girafe(ggobj = gg)
Function `ggiraph()` is replaced by `girafe()` and will be removed soon.

Conclusion

Chlamydia has the highest rate of infection of the three STIs, though gonorrhea follows a similar pattern of infection rate by county. Syphilis, which has the lowest infection rate of the three, seems to have a few hot spots, but it’s hard to say whether there is a pattern.